#!/usr/bin/env python3

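## Uses the scikit-learn TF-IDF implementation to vectorize the dictionary definitions,
## fits a ridge regression on the result, and pickles the fitted pipeline.
## NOTE(review): this header originally said the script "produces both a title-level and
## an article-level set of predictions"; no prediction step is visible here -- confirm.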

import helpers
import pickle
import pandas as pd
import string
import numpy as np
from nltk.corpus import stopwords
import math
import csv
import re
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso, LogisticRegression
from sklearn.feature_selection import SelectFromModel
from sklearn.preprocessing import StandardScaler
from sklearn import linear_model
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV #not in grid_search any more maybe?
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MaxAbsScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.feature_extraction.text import CountVectorizer
import urllib

# Input table and output locations (all under the processed_data directory).
procDataStem = "/gscratch/comdata/users/kaylea/taboo/processed_data/"
infile = "/gscratch/comdata/users/kaylea/taboo/processed_data/narrow_cleanedParsedWikts.tsv"
modelfile = f"{procDataStem}unigram_fitted_model.sav"


# Custom stop words, used in addition to NLTK's English stop-word list.
my_stop = [
    'term',
    'used',
    'usually',
    'particularly',
    'etc',
    'extremely',
    'especially',
    'one',
    'en',
    'something',
    'often',
    'synonym',
    'like',
    'etc.',
    'person',
]
# Kept as a list (custom words first) because it is handed to the vectorizer
# as its stop_words argument further down.
stop = [*my_stop, *stopwords.words('english')]


#uncomment here for test rig
#infile = "minimalTest.tsv"
#excerptsfile = "data/input/testexcerpts.tsv" 

## Prep the Classifier

print(f"reading {infile}....")
inDF = pd.read_csv(infile, sep='\t', header=0)
original = inDF.shape

# Drop incomplete rows BEFORE stringifying: str(NaN) is the literal text "nan",
# so stringifying first would turn a missing definition into a one-word
# document that dropna() can no longer detect.
# NOTE(review): with its default settings pd.read_csv also parses cells such as
# "NA", "null" or "nan" as missing, so entries whose text is literally one of
# those are dropped here too -- confirm that is acceptable for this data.
inDF = inDF.dropna()
inDF = inDF.reset_index(drop=True)
minusNA = inDF.shape

numInIt = re.compile(r'\d')  # raw string: a bare '\d' is an invalid str escape
stopSet = set(stop)  # set membership instead of scanning the list per token


def _clean_definition(definition):
    """Return *definition* lowercased with unwanted tokens removed.

    The value is stringified, lowercased and split on whitespace. A token is
    discarded when it contains a digit, when it is found in
    ``string.punctuation`` (i.e. it is a bare punctuation mark), or when it is
    a stop word. The surviving tokens are re-joined with single spaces, which
    also removes any leading or trailing whitespace.

    NOTE: punctuation attached to a word (e.g. "used,") is left in place, so
    such a token is not recognised as a stop word at this stage.
    """
    tokens = str(definition).lower().split()
    return ' '.join(
        token for token in tokens
        if not numInIt.search(token)
        and token not in string.punctuation
        and token not in stopSet
    )


inDF['dict_definition'] = inDF['dict_definition'].apply(_clean_definition)

print(f"Starting with {original} dictionary entries, dropped NAs and now have {minusNA} entries.")


## Fit the Model

# Target column and the cleaned definition text used as input.
y = inDF.dict_taboo
X = inDF.dict_definition

# Count how often each term occurs across all definitions and save the table.
#cv = CountVectorizer(ngram_range=(1,3))
cv = CountVectorizer(ngram_range=(1,1))
cv_fit = cv.fit_transform(X)
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installations.
if hasattr(cv, "get_feature_names_out"):
    ngramNames = cv.get_feature_names_out()
else:
    ngramNames = cv.get_feature_names()
# Column sums of the document-term matrix give each term's total count.
ngramCounts = np.asarray(cv_fit.sum(axis=0)).ravel().tolist()
freqData = dict(zip(ngramNames, ngramCounts))
#print(freqData)

# Explicit UTF-8 so non-ASCII terms are written the same way regardless of
# the locale the script runs under.
with open(procDataStem + 'ngrams.tsv', 'w', encoding='utf-8') as f:
    f.write("ngram\tcount\n")
    f.writelines(f"{k}\t{v}\n" for k, v in freqData.items())


#estimators = [("tf_idf", TfidfVectorizer(ngram_range=(1,3), stop_words=stop)), ("zeroToOne", MaxAbsScaler()), ("ridge", linear_model.Ridge())] 
#estimators = [("tf_idf", TfidfVectorizer(ngram_range=(1,3), stop_words=stop)), ("ridge", linear_model.Ridge())] 
# Two-step pipeline: unigram TF-IDF features feeding a ridge regression.
estimators = [("tf_idf", TfidfVectorizer(ngram_range=(1,1), stop_words=stop)), ("ridge", linear_model.Ridge())]
model = Pipeline(estimators)


print(f"X is {X}")
print("===========================")
print(f"y is {y}")

model = model.fit(X, y)
#if we do any hyper parameter tuning it can happen in here
finalModel = model

print(f"For the model I now have {finalModel}")
tf_idf_model = finalModel.named_steps["tf_idf"]
ridge_model = finalModel.named_steps["ridge"]
print(tf_idf_model)
print(ridge_model)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# use its replacement when available and fall back on older installations.
if hasattr(tf_idf_model, "get_feature_names_out"):
    featureNames = tf_idf_model.get_feature_names_out()
else:
    featureNames = tf_idf_model.get_feature_names()
coefficients = pd.DataFrame({"names": featureNames, "coef": ridge_model.coef_})

# Sort once (largest coefficient first) and show both ends of the ranking.
rankedCoefficients = coefficients.sort_values("coef", ascending=False)
print("Best")
print(rankedCoefficients.head(30))
print("Worst")
print(rankedCoefficients.tail(30))

##model building done.

# Persist the fitted pipeline; the context manager guarantees the file is
# flushed and closed even if pickling raises.
with open(modelfile, 'wb') as modelOut:
    pickle.dump(finalModel, modelOut)
